Part 1

Task A

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# NOTE(review): stray interactive help call left in the script; harmless when
# knitting but should be removed from a production script.
?read.csv2()
# Read the semicolon-separated consumption data and parse STARTTID as a
# POSIXct timestamp in the Europe/Oslo time zone.
elhub <- read.csv2("consumption_per_group_aas_hour.csv")
elhub <- elhub %>%
  mutate(STARTTID = ymd_hms(STARTTID, tz = "Europe/Oslo"))
## Date in ISO8601 format; converting timezone from UTC to "Europe/Oslo".

We found that the easiest way to handle the time zone was to mutate STARTTID, setting the time zone to "Europe/Oslo". We had some trouble with the locale argument.

# Keep only the columns needed for the analysis; the dplyr:: prefix avoids
# any masking ambiguity with other attached packages.
elhub <- elhub %>%
  dplyr::select(STARTTID, FORBRUKSGRUPPE, VOLUM_KWH)
# Drop leading rows until the first row with no missing values.
first_complete_row <- which(complete.cases(elhub))[1]
elhub <- elhub[first_complete_row:nrow(elhub), ]

Trimmed the data to start from the first complete row.

# Aggregate the hourly consumption into daily totals per consumer group.
daily_data <- elhub %>%
  mutate(date = floor_date(STARTTID, "day")) %>%
  group_by(date, FORBRUKSGRUPPE) %>%
  summarise(daily_kwh = sum(VOLUM_KWH, na.rm = TRUE))
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
# Dates for DST changes.
# 2021-03-28 02:00 does not exist in Europe/Oslo (clocks jump from 02:00
# straight to 03:00), so parsing it yields NA ("1 failed to parse") and every
# comparison against it is NA, silently filtering out ALL rows. Use 03:00,
# the first valid local instant after the spring transition.
dst_start <- ymd_hms("2021-03-28 03:00:00", tz = "Europe/Oslo")
dst_end <- ymd_hms("2021-10-31 02:00:00", tz = "Europe/Oslo")

# Rows around the spring transition (none expected: data starts 2021-04-01).
dst_start_check <- elhub %>%
  filter(STARTTID >= dst_start - hours(2) & STARTTID <= dst_start + hours(12))

# Rows around the autumn transition (the 02:00 hour occurs twice).
dst_end_check <- elhub %>%
  filter(STARTTID >= dst_end - hours(1) & STARTTID <= dst_end + hours(1))

# Inspect the rows around each DST transition. The October window shows the
# repeated 02:00 hour with distinct consumption values.
print("DST Start Transition (March)")
## [1] "DST Start Transition (March)"
print(dst_start_check)
## [1] STARTTID       FORBRUKSGRUPPE VOLUM_KWH     
## <0 rows> (or 0-length row.names)
print("DST End Transition (October)")
## [1] "DST End Transition (October)"
print(dst_end_check)
##               STARTTID FORBRUKSGRUPPE VOLUM_KWH
## 1  2021-10-31 01:00:00     Forretning  8719.900
## 2  2021-10-31 01:00:00       Industri  2267.775
## 3  2021-10-31 01:00:00         Privat 10941.377
## 4  2021-10-31 02:00:00     Forretning  8604.830
## 5  2021-10-31 02:00:00       Industri  2189.055
## 6  2021-10-31 02:00:00         Privat 10347.846
## 7  2021-10-31 02:00:00     Forretning  8541.550
## 8  2021-10-31 02:00:00       Industri  2174.467
## 9  2021-10-31 02:00:00         Privat  9953.731
## 10 2021-10-31 03:00:00     Forretning  8560.082
## 11 2021-10-31 03:00:00       Industri  2148.565
## 12 2021-10-31 03:00:00         Privat  9878.763
# "2021-03-28 00:00" has no seconds component, so ymd_hms() fails to parse it
# ("All formats failed to parse") and the filter compares against NA,
# returning zero rows regardless of the data. Use ymd_hm() with an explicit
# time zone so the comparison happens in local Oslo time.
dst_start_day_check <- elhub %>%
  filter(STARTTID >= ymd_hm("2021-03-28 00:00", tz = "Europe/Oslo") &
           STARTTID < ymd_hm("2021-03-29 00:00", tz = "Europe/Oslo"))
print("Entire Day Check for DST Start (March 28)")
## [1] "Entire Day Check for DST Start (March 28)"
# Still zero rows, but now legitimately so: the data begins on 2021-04-01.
print(dst_start_day_check)
## [1] STARTTID       FORBRUKSGRUPPE VOLUM_KWH     
## <0 rows> (or 0-length row.names)

Data starts at 2021-04-01, which is obviously after the last Sunday in March. The last Sunday in October makes perfect sense, with duplicate times of 02:00 with different values.

On the last Sunday in March we move forward by one hour at 02:00, skipping 02:00–02:59. This should leave us with a missing hour on this day. On the last Sunday in October we move backwards by one hour at 03:00, repeating the hour 02:00–02:59. This should leave us with a duplicated hour in our data. When using hourly data, this could increase the complexity of our analysis and the model's sensitivity, but when we aggregate our hourly data into daily data, the impact of DST is minimized.

Task B

library(readxl)

load_and_process_file <- function(file_path) {
  # Read one Excel workbook of daily weather observations and return a
  # tibble with the date (floored to midnight), air temperature (LT) and
  # global radiation (GLOB).
  read_excel(file_path) %>%
    select(DATO, LT, GLOB) %>%
    mutate(
      DATO = as.POSIXct(DATO),
      DATO = floor_date(DATO, unit = "day")
    )
}

Comment: We had trouble understanding the connection between the dates (what we should do) and the assignment. Therefore we adjusted the dates to contain only the date without the hms.

# Collect every yearly weather workbook and stack them into one data frame.
file_paths <- list.files(pattern = "Aas dogn .*\\.xlsx", full.names = TRUE)

all_data <- do.call(rbind, lapply(file_paths, load_and_process_file))
## New names:
## • `` -> `...30`
# Define the range of dates you expect
start_date <- as.Date("2017-01-01")
end_date <- as.Date("2024-12-31")
expected_dates <- seq.Date(start_date, end_date, by = "day")

# Filter for leap year dates (February 29 only exists in leap years)
leap_year_dates <- expected_dates[month(expected_dates) == 2 & day(expected_dates) == 29]

Leap year adjustments ensure consistency across non-leap years, helping to avoid mismatches when modeling or analyzing yearly seasonal patterns.

# Sort chronologically, then count missing values per column; only GLOB
# has NAs (8 of them).
all_data <- all_data[order(all_data$DATO),]
na_count <- sapply(all_data, function(x) sum(is.na(x)))
na_count
## DATO   LT GLOB 
##    0    0    8
# Visualise the raw GLOB series before imputation.
ggplot(all_data, aes(x = DATO, y = GLOB)) +
  geom_line() +
  labs(title = "Time Series of GLOB Values", x = "Date", y = "GLOB") +
  theme_minimal()

# Impute the 8 missing GLOB values with Kalman smoothing on an auto.arima
# state-space model. Impute the GLOB column explicitly rather than passing
# the whole data frame, and pass the imputed *vector* (not the data frame)
# to ggplot_na_imputations(), which expects the series with NAs and the
# same series with imputations.
all_data_imputed <- all_data
all_data_imputed$GLOB <- na_kalman(all_data$GLOB, model = "auto.arima")
ggplot_na_imputations(all_data$GLOB, all_data_imputed$GLOB)

We used the imputeTS cheat sheet to help us find an imputation method. Kalman smoothing respects the seasonality and trend, which we figured to be important in our case. We tried some different methods but ended up with a nice-looking imputation from the Kalman approach, and stuck with that. Kalman smoothing is a dynamic model which adapts to changing patterns. It also separates signal from noise, improving accuracy.

Task C

# Overlapping coverage of the two sources: consumption runs 2021-04-01 to
# 2024-09-30, weather runs 2017-01-01 to 2024-10-13.
range_of_daily_data <- c(min(daily_data$date), max(daily_data$date))
range_of_daily_data
## [1] "2021-04-01 CEST" "2024-09-30 CEST"
range_of_all_data <- c(min(all_data$DATO), max(all_data$DATO))
range_of_all_data
## [1] "2017-01-01 UTC" "2024-10-13 UTC"

Merge the two datasets beginning in 2021-04-01 to 2024-09-30.

# One column per consumer group so consumption and weather can be joined
# on a single date key.
daily_data_wide <- daily_data %>%
  pivot_wider(names_from = FORBRUKSGRUPPE, values_from = daily_kwh)

# date range (without time zone)
start_date <- "2021-04-30"
end_date <- "2024-09-30"

# Filter all_data for the specified date range
filtered_all_data <- all_data_imputed %>%
  filter(DATO >= start_date & DATO <= end_date)

# Filter daily_data_wide for the specified date range
filtered_daily_data <- daily_data_wide %>%
  filter(date >= start_date & date <= end_date)

# Ensure both date columns are in Date class
filtered_all_data <- filtered_all_data %>%
  mutate(DATO = as.Date(DATO))

filtered_daily_data <- filtered_daily_data %>%
  mutate(date = as.Date(date))

# Merge the filtered datasets on the specified date columns using full_join
# (full join keeps dates present in either source; gaps become NA rows).
merged_data <- filtered_all_data %>%
  full_join(filtered_daily_data, by = c("DATO" = "date"))
head(merged_data)
# Filter out leap day (February 29)
merged_data_no_leap <- merged_data %>%
  filter(!(format(DATO, "%m-%d") == "02-29"))
# Dates present in one source but not the other. setdiff() drops the Date
# class, so these print as day counts; they are converted back below.
missing_in_daily_data <- setdiff(filtered_all_data$DATO, filtered_daily_data$date)
missing_in_all_data <- setdiff(filtered_daily_data$date, filtered_all_data$DATO)

print(missing_in_daily_data)
## numeric(0)
print(missing_in_all_data)
##   [1] 19358 19359 19360 19361 19362 19363 19364 19365 19366 19367 19368 19369
##  [13] 19370 19371 19372 19373 19374 19375 19376 19377 19378 19379 19380 19381
##  [25] 19382 19383 19384 19385 19386 19387 19388 19389 19390 19391 19392 19393
##  [37] 19394 19395 19396 19397 19398 19399 19400 19401 19402 19403 19404 19405
##  [49] 19406 19407 19408 19409 19410 19411 19412 19413 19414 19415 19416 19417
##  [61] 19418 19419 19420 19421 19422 19423 19424 19425 19426 19427 19428 19429
##  [73] 19430 19431 19432 19433 19434 19435 19436 19437 19438 19439 19440 19441
##  [85] 19442 19443 19444 19445 19446 19447 19448 19449 19450 19451 19452 19453
##  [97] 19454 19455 19456 19457 19458 19459 19460 19461 19462 19463 19464 19465
## [109] 19466 19467 19468 19469 19470 19471 19472 19473 19474 19475 19476 19477
## [121] 19478 19479 19480 19481 19482 19483 19484 19485 19486 19487 19488 19489
## [133] 19490 19491 19492 19493 19494 19495 19496 19497 19498 19499 19500 19501
## [145] 19502 19503 19504 19505 19506 19507 19508 19509 19510 19511 19512 19513
## [157] 19514 19515 19516 19517 19518 19519 19520 19521 19522 19523 19524 19525
## [169] 19526 19527 19528 19529 19530 19531 19532 19533 19534 19535 19536 19537
## [181] 19538 19539 19540 19541 19542 19543 19544 19545 19546 19547 19548 19549
## [193] 19550 19551 19552 19553 19554 19555 19556 19557 19558 19559 19560 19561
## [205] 19562 19563 19564 19565 19566 19567 19568 19569 19570 19571 19572 19573
## [217] 19574 19575 19576 19577 19578 19579 19580 19581 19582 19583 19584 19585
## [229] 19586 19587 19588 19589 19590 19591 19592 19593 19594 19595 19596 19597
## [241] 19598 19599 19600 19601 19602 19603 19604 19605 19606 19607 19608 19609
## [253] 19610 19611 19612 19613 19614 19615 19616 19617 19618 19619 19620 19621
## [265] 19622 19623 19624 19625 19626 19627 19628 19629 19630 19631 19632 19633
## [277] 19634 19635 19636 19637 19638 19639 19640 19641 19642 19643 19644 19645
## [289] 19646 19647 19648 19649 19650 19651 19652 19653 19654 19655 19656 19657
## [301] 19658 19659 19660 19661 19662 19663 19664 19665 19666 19667 19668 19669
## [313] 19670 19671 19672 19673 19674 19675 19676 19677 19678 19679 19680 19681
## [325] 19682 19683 19684 19685 19686 19687 19688 19689 19690 19691 19692 19693
## [337] 19694 19695 19696 19697 19698 19699 19700 19701 19702 19703 19704 19705
## [349] 19706 19707 19708 19709 19710 19711 19712 19713 19714 19715 19716 19717
## [361] 19718 19719 19720 19721 19722 19801 19824
# Restore the Date class that setdiff() stripped (values are days since
# the 1970-01-01 epoch).
missing_in_daily_data <- as.Date(missing_in_daily_data, origin = "1970-01-01")
missing_in_all_data <- as.Date(missing_in_all_data, origin = "1970-01-01")

print(missing_in_daily_data)
## Date of length 0
print(missing_in_all_data)
##   [1] "2023-01-01" "2023-01-02" "2023-01-03" "2023-01-04" "2023-01-05"
##   [6] "2023-01-06" "2023-01-07" "2023-01-08" "2023-01-09" "2023-01-10"
##  [11] "2023-01-11" "2023-01-12" "2023-01-13" "2023-01-14" "2023-01-15"
##  [16] "2023-01-16" "2023-01-17" "2023-01-18" "2023-01-19" "2023-01-20"
##  [21] "2023-01-21" "2023-01-22" "2023-01-23" "2023-01-24" "2023-01-25"
##  [26] "2023-01-26" "2023-01-27" "2023-01-28" "2023-01-29" "2023-01-30"
##  [31] "2023-01-31" "2023-02-01" "2023-02-02" "2023-02-03" "2023-02-04"
##  [36] "2023-02-05" "2023-02-06" "2023-02-07" "2023-02-08" "2023-02-09"
##  [41] "2023-02-10" "2023-02-11" "2023-02-12" "2023-02-13" "2023-02-14"
##  [46] "2023-02-15" "2023-02-16" "2023-02-17" "2023-02-18" "2023-02-19"
##  [51] "2023-02-20" "2023-02-21" "2023-02-22" "2023-02-23" "2023-02-24"
##  [56] "2023-02-25" "2023-02-26" "2023-02-27" "2023-02-28" "2023-03-01"
##  [61] "2023-03-02" "2023-03-03" "2023-03-04" "2023-03-05" "2023-03-06"
##  [66] "2023-03-07" "2023-03-08" "2023-03-09" "2023-03-10" "2023-03-11"
##  [71] "2023-03-12" "2023-03-13" "2023-03-14" "2023-03-15" "2023-03-16"
##  [76] "2023-03-17" "2023-03-18" "2023-03-19" "2023-03-20" "2023-03-21"
##  [81] "2023-03-22" "2023-03-23" "2023-03-24" "2023-03-25" "2023-03-26"
##  [86] "2023-03-27" "2023-03-28" "2023-03-29" "2023-03-30" "2023-03-31"
##  [91] "2023-04-01" "2023-04-02" "2023-04-03" "2023-04-04" "2023-04-05"
##  [96] "2023-04-06" "2023-04-07" "2023-04-08" "2023-04-09" "2023-04-10"
## [101] "2023-04-11" "2023-04-12" "2023-04-13" "2023-04-14" "2023-04-15"
## [106] "2023-04-16" "2023-04-17" "2023-04-18" "2023-04-19" "2023-04-20"
## [111] "2023-04-21" "2023-04-22" "2023-04-23" "2023-04-24" "2023-04-25"
## [116] "2023-04-26" "2023-04-27" "2023-04-28" "2023-04-29" "2023-04-30"
## [121] "2023-05-01" "2023-05-02" "2023-05-03" "2023-05-04" "2023-05-05"
## [126] "2023-05-06" "2023-05-07" "2023-05-08" "2023-05-09" "2023-05-10"
## [131] "2023-05-11" "2023-05-12" "2023-05-13" "2023-05-14" "2023-05-15"
## [136] "2023-05-16" "2023-05-17" "2023-05-18" "2023-05-19" "2023-05-20"
## [141] "2023-05-21" "2023-05-22" "2023-05-23" "2023-05-24" "2023-05-25"
## [146] "2023-05-26" "2023-05-27" "2023-05-28" "2023-05-29" "2023-05-30"
## [151] "2023-05-31" "2023-06-01" "2023-06-02" "2023-06-03" "2023-06-04"
## [156] "2023-06-05" "2023-06-06" "2023-06-07" "2023-06-08" "2023-06-09"
## [161] "2023-06-10" "2023-06-11" "2023-06-12" "2023-06-13" "2023-06-14"
## [166] "2023-06-15" "2023-06-16" "2023-06-17" "2023-06-18" "2023-06-19"
## [171] "2023-06-20" "2023-06-21" "2023-06-22" "2023-06-23" "2023-06-24"
## [176] "2023-06-25" "2023-06-26" "2023-06-27" "2023-06-28" "2023-06-29"
## [181] "2023-06-30" "2023-07-01" "2023-07-02" "2023-07-03" "2023-07-04"
## [186] "2023-07-05" "2023-07-06" "2023-07-07" "2023-07-08" "2023-07-09"
## [191] "2023-07-10" "2023-07-11" "2023-07-12" "2023-07-13" "2023-07-14"
## [196] "2023-07-15" "2023-07-16" "2023-07-17" "2023-07-18" "2023-07-19"
## [201] "2023-07-20" "2023-07-21" "2023-07-22" "2023-07-23" "2023-07-24"
## [206] "2023-07-25" "2023-07-26" "2023-07-27" "2023-07-28" "2023-07-29"
## [211] "2023-07-30" "2023-07-31" "2023-08-01" "2023-08-02" "2023-08-03"
## [216] "2023-08-04" "2023-08-05" "2023-08-06" "2023-08-07" "2023-08-08"
## [221] "2023-08-09" "2023-08-10" "2023-08-11" "2023-08-12" "2023-08-13"
## [226] "2023-08-14" "2023-08-15" "2023-08-16" "2023-08-17" "2023-08-18"
## [231] "2023-08-19" "2023-08-20" "2023-08-21" "2023-08-22" "2023-08-23"
## [236] "2023-08-24" "2023-08-25" "2023-08-26" "2023-08-27" "2023-08-28"
## [241] "2023-08-29" "2023-08-30" "2023-08-31" "2023-09-01" "2023-09-02"
## [246] "2023-09-03" "2023-09-04" "2023-09-05" "2023-09-06" "2023-09-07"
## [251] "2023-09-08" "2023-09-09" "2023-09-10" "2023-09-11" "2023-09-12"
## [256] "2023-09-13" "2023-09-14" "2023-09-15" "2023-09-16" "2023-09-17"
## [261] "2023-09-18" "2023-09-19" "2023-09-20" "2023-09-21" "2023-09-22"
## [266] "2023-09-23" "2023-09-24" "2023-09-25" "2023-09-26" "2023-09-27"
## [271] "2023-09-28" "2023-09-29" "2023-09-30" "2023-10-01" "2023-10-02"
## [276] "2023-10-03" "2023-10-04" "2023-10-05" "2023-10-06" "2023-10-07"
## [281] "2023-10-08" "2023-10-09" "2023-10-10" "2023-10-11" "2023-10-12"
## [286] "2023-10-13" "2023-10-14" "2023-10-15" "2023-10-16" "2023-10-17"
## [291] "2023-10-18" "2023-10-19" "2023-10-20" "2023-10-21" "2023-10-22"
## [296] "2023-10-23" "2023-10-24" "2023-10-25" "2023-10-26" "2023-10-27"
## [301] "2023-10-28" "2023-10-29" "2023-10-30" "2023-10-31" "2023-11-01"
## [306] "2023-11-02" "2023-11-03" "2023-11-04" "2023-11-05" "2023-11-06"
## [311] "2023-11-07" "2023-11-08" "2023-11-09" "2023-11-10" "2023-11-11"
## [316] "2023-11-12" "2023-11-13" "2023-11-14" "2023-11-15" "2023-11-16"
## [321] "2023-11-17" "2023-11-18" "2023-11-19" "2023-11-20" "2023-11-21"
## [326] "2023-11-22" "2023-11-23" "2023-11-24" "2023-11-25" "2023-11-26"
## [331] "2023-11-27" "2023-11-28" "2023-11-29" "2023-11-30" "2023-12-01"
## [336] "2023-12-02" "2023-12-03" "2023-12-04" "2023-12-05" "2023-12-06"
## [341] "2023-12-07" "2023-12-08" "2023-12-09" "2023-12-10" "2023-12-11"
## [346] "2023-12-12" "2023-12-13" "2023-12-14" "2023-12-15" "2023-12-16"
## [351] "2023-12-17" "2023-12-18" "2023-12-19" "2023-12-20" "2023-12-21"
## [356] "2023-12-22" "2023-12-23" "2023-12-24" "2023-12-25" "2023-12-26"
## [361] "2023-12-27" "2023-12-28" "2023-12-29" "2023-12-30" "2023-12-31"
## [366] "2024-03-19" "2024-04-11"

Had trouble with the previous dates used. 2021-04-05/2021-04-29 suddenly gave missing values for the daily_data part of the merged data. Changing the start date to 2021-04-30 to avoid trouble.

# Impute the weather gaps introduced by the full join (note: all of 2023 is
# absent from the weather data) with per-column Kalman smoothing.
merged_data_no_leap_imputed <- merged_data_no_leap %>%
  mutate(across(c(GLOB, LT), ~ na_kalman(.)))

Imputed the missing weather values — note this includes the whole of 2023, which was absent from the weather files and introduced as NA rows by the full join.

# Visualise where the imputed values sit in the merged series.
ggplot_na_distribution(merged_data_no_leap_imputed)

# Sort by date and keep one row per day (the October DST fall-back produced
# duplicate dates in the hourly source).
merged_data_no_leap_imputed <- merged_data_no_leap_imputed %>%
  arrange(DATO) %>%
  distinct(DATO, .keep_all = TRUE)

Arranged the correct order of dates, and removed duplicates.

Part 2

Task D

library(ggplot2)
library(tidyr)

# Convert data to long format for easier plotting of multiple variables
merged_data_long <- merged_data_no_leap_imputed %>%
  pivot_longer(cols = c(Forretning, Industri, Privat), 
               names_to = "Measurement", values_to = "Value")

# Raw daily consumption per consumer group on a shared time axis.
ggplot(merged_data_long, aes(x = DATO, y = Value, color = Measurement)) +
  geom_line() +
  labs(title = "Raw Data Visualization for Measurements",
       x = "Date",
       y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

They all peak in the winter season and are at their lowest in the summer season. It looks like there is an increasing trend in 'Privat', while the two others look more stable.

# Same long-format reshape for the weather variables (temperature and
# global radiation).
merged_data_long2 <- merged_data_no_leap_imputed %>%
  pivot_longer(cols = c(LT, GLOB), 
               names_to = "Measurement", values_to = "Value")

ggplot(merged_data_long2, aes(x = DATO, y = Value, color = Measurement)) +
  geom_line() +
  labs(title = "Raw Data Visualization for Measurements",
       x = "Date",
       y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

We find seasonality in this plot as well, with an opposite pattern: it is at its lowest in the winter and highest in the summer season. The air temperature (LT) seems to dip lower each year, and in the previous plot we could see that the private usage of electricity increased each year; we might have a strong correlation here. The global radiation (GLOB) looks stable, and we see that we have the most radiation in the summer season.

min_max_scale <- function(x) {
  # Linearly rescale a numeric vector to [0, 1], ignoring NAs when locating
  # the extremes. NAs stay NA; a constant vector yields NaN (0 / 0).
  lo <- min(x, na.rm = TRUE)
  hi <- max(x, na.rm = TRUE)
  (x - lo) / (hi - lo)
}

# min-max scaling: put all five series on a common 0-1 scale so their
# seasonal shapes can be compared in a single plot.
merged_data_long <- merged_data_no_leap_imputed %>%
  pivot_longer(cols = c(LT, GLOB, Forretning, Industri, Privat),
               names_to = "Measurement", values_to = "Value") %>%
  group_by(Measurement) %>%
  mutate(Value_scaled = min_max_scale(Value))

ggplot(merged_data_long, aes(x = DATO, y = Value_scaled, color = Measurement)) +
  geom_line() +
  labs(title = "Scaled Data Visualization for Measurements",
       x = "Date", y = "Scaled Value (0-1)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The data scaled between 0 and 1. Notice the crossing of seasonal trends.

# Check for any remaining duplicate dates (none: distinct() above already
# removed them, hence the empty tibble below).
duplicates_in_data <- merged_data_no_leap_imputed %>%
  filter(duplicated(DATO) | duplicated(DATO, fromLast = TRUE))
print(duplicates_in_data)
## # A tibble: 0 × 6
## # ℹ 6 variables: DATO <date>, LT <dbl>, GLOB <dbl>, Forretning <dbl>,
## #   Industri <dbl>, Privat <dbl>

The duplicate dates (2024-03-18 and 2024-04-02) were already removed by the distinct() call above, which is why this check now returns zero rows.

# De-duplicate once more (a no-op here) and index the data by date as a
# tsibble so the feasts feature functions can be used.
merged_data_clean <- merged_data_no_leap_imputed %>%
  distinct(DATO, .keep_all = TRUE)
library(feasts)
## Loading required package: fabletools
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
merged_tsibble <- merged_data_clean %>%
  as_tsibble(index = DATO)

In the KPSS test, p-value over 0.05 indicates stationarity and if its under 0.05, it indicates non-stationarity.

# KPSS on GLOB. H0 of the KPSS test is stationarity; note the reported
# p-value is truncated to the [0.01, 0.1] interval by the underlying test.
kpss_result_glob <- merged_tsibble %>%
  features(GLOB, unitroot_kpss)
print("KPSS Test for GLOB:")
## [1] "KPSS Test for GLOB:"
print(kpss_result_glob)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      1.08        0.01

p-value = 0.01 < 0.05 -> indicating non-stationarity for GLOB (the reported p-value is truncated at 0.01).

# KPSS on LT (air temperature).
kpss_result_lt <- merged_tsibble %>%
  features(LT, unitroot_kpss)
print("KPSS Test for LT:")
## [1] "KPSS Test for LT:"
print(kpss_result_lt)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.786        0.01

p-value < 0.05 -> indicating non-stationarity for LT.

# KPSS on Forretning (business consumption).
kpss_result_forretning <- merged_tsibble %>%
  features(Forretning, unitroot_kpss)
print("KPSS Test for Forretning:")
## [1] "KPSS Test for Forretning:"
print(kpss_result_forretning)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.993        0.01

p-value < 0.05 -> indicating non-stationarity for Forretning.

# KPSS on Industri (industrial consumption).
kpss_result_industri <- merged_tsibble %>%
  features(Industri, unitroot_kpss)
print("KPSS Test for Industri:")
## [1] "KPSS Test for Industri:"
print(kpss_result_industri)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.571      0.0257

p-value < 0.05 -> indicating non-stationarity for Industri.

# KPSS on Privat (household consumption).
kpss_result_privat <- merged_tsibble %>%
  features(Privat, unitroot_kpss)
print("KPSS Test for Privat:")
## [1] "KPSS Test for Privat:"
print(kpss_result_privat)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.724      0.0114

p-value < 0.05 -> indicating non-stationarity for Privat.

# Count remaining NA cells (0 expected after imputation).
# NOTE: the original which(is.na(df)) returns *cell* positions in
# column-major order, not row numbers, so df[which(is.na(df)), ] is not a
# valid row lookup (it only appeared to work because the result was empty).
# complete.cases() is the row-wise equivalent.
sum(is.na(merged_data_clean))
merged_data_clean[!complete.cases(merged_data_clean), ]
merged_data_clean <- merged_data_clean %>% na.omit()
# function to calculate max cross-correlation for each pair
# function to calculate max cross-correlation for each pair
#
# For every ordered pair of columns in `data`, find the largest absolute
# cross-correlation and the lag at which it occurs.
#
# Fixes two defects in the original implementation:
#  * the reported lag was `which.max(...) - 1`, i.e. a position inside the
#    acf vector, not a lag: ccf() returns lags -lag.max..+lag.max, so the
#    true lag must be read from the $lag component at the maximising index;
#  * ccf() was computed twice per pair (once for the max, once for the lag).
cross_correlation_matrix <- function(data) {
  variable_names <- colnames(data)
  pairs <- expand.grid(var1 = variable_names, var2 = variable_names)
  pairs <- pairs[pairs$var1 != pairs$var2, ]

  rows <- lapply(seq_len(nrow(pairs)), function(i) {
    v1 <- as.character(pairs$var1[i])
    v2 <- as.character(pairs$var2[i])
    cc <- ccf(data[[v1]], data[[v2]], plot = FALSE)
    k <- which.max(abs(cc$acf))
    data.frame(
      var1 = pairs$var1[i], var2 = pairs$var2[i],
      max_ccf = abs(cc$acf[k]),
      max_lag = cc$lag[k]
    )
  })
  do.call(rbind, rows)
}
# Drop the date key and compute the pairwise maximum cross-correlations.
merged_data_values <- merged_data_clean %>%
  select(-DATO)
ccf_results <- cross_correlation_matrix(merged_data_values)
print(ccf_results)
## # A tibble: 20 × 4
## # Rowwise: 
##    var1       var2       max_ccf max_lag
##    <fct>      <fct>        <dbl>   <dbl>
##  1 GLOB       LT           0.725       7
##  2 Forretning LT           0.680      26
##  3 Industri   LT           0.666      26
##  4 Privat     LT           0.805      26
##  5 LT         GLOB         0.725      47
##  6 Forretning GLOB         0.524      48
##  7 Industri   GLOB         0.554      48
##  8 Privat     GLOB         0.648      54
##  9 LT         Forretning   0.680      28
## 10 GLOB       Forretning   0.524       6
## 11 Industri   Forretning   0.963      27
## 12 Privat     Forretning   0.823      27
## 13 LT         Industri     0.666      28
## 14 GLOB       Industri     0.554       6
## 15 Forretning Industri     0.963      27
## 16 Privat     Industri     0.816      27
## 17 LT         Privat       0.805      28
## 18 GLOB       Privat       0.648       0
## 19 Forretning Privat       0.823      27
## 20 Industri   Privat       0.816      27
# Heatmap of the pairwise maximum cross-correlations.
ggplot(ccf_results, aes(x = var1, y = var2, fill = max_ccf)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0.5) +
  labs(title = "Cross-Correlation Between Variables",
       x = "", y = "", fill = "Max CCF") +
  theme_minimal()

From this cross-correlation, the assumption that Privat and LT would have a strong correlation holds true. Forretning and Industri also have a strong correlation. GLOB has low correlation with all the other variables.

plot_acf_pacf_short <- function(data, column_name) {
  # Draw short-horizon (14-day) ACF and PACF correlograms for one column.
  series <- data[[column_name]]
  cat("Plotting ACF and PACF for", column_name, "short-term (14 days)\n")
  acf(series, lag.max = 14, main = paste("ACF of", column_name, "(Short-term)"))
  pacf(series, lag.max = 14, main = paste("PACF of", column_name, "(Short-term)"))
}

plot_acf_pacf_long <- function(data, column_name) {
  # Draw long-horizon (2-year, 730-day) ACF and PACF correlograms for one
  # column, long enough to expose annual seasonality.
  series <- data[[column_name]]
  cat("Plotting ACF and PACF for", column_name, "long-term (2 years)\n")
  acf(series, lag.max = 730, main = paste("ACF of", column_name, "(Long-term)"))
  pacf(series, lag.max = 730, main = paste("PACF of", column_name, "(Long-term)"))
}
# Run both the short- and long-term correlograms for every series.
columns_to_analyze <- c("GLOB", "LT", "Forretning", "Industri", "Privat")
for (col in columns_to_analyze) {
  plot_acf_pacf_short(merged_data_clean, col)
  plot_acf_pacf_long(merged_data_clean, col)
}
## Plotting ACF and PACF for GLOB short-term (14 days)

## Plotting ACF and PACF for GLOB long-term (2 years)

## Plotting ACF and PACF for LT short-term (14 days)

## Plotting ACF and PACF for LT long-term (2 years)

## Plotting ACF and PACF for Forretning short-term (14 days)

## Plotting ACF and PACF for Forretning long-term (2 years)

## Plotting ACF and PACF for Industri short-term (14 days)

## Plotting ACF and PACF for Industri long-term (2 years)

## Plotting ACF and PACF for Privat short-term (14 days)

## Plotting ACF and PACF for Privat long-term (2 years)

#### GLOB:

When we look at the short-term ACF, it decays slowly, which suggests non-stationarity, indicating trend or seasonality in the data. All the bars are above the blue confidence-interval lines, which suggests that the GLOB series has strong autocorrelation in the short term. The PACF has little to no significant correlation after accounting for the high correlation at the first lag. After lag 7, all the partial autocorrelation values stay close to the blue lines. Since it drops off after lag 1, we can assume that the series can be modeled as an autoregressive process of order 1. This PACF plot shows a decay in the correlation and could suggest that the series is stationary, opposing the ACF plot (both for the short term).

In the long-term ACF, the sinusoidal shape suggests that the series has a strong seasonal component. The PACF again supports an AR(1) model, and the decay of the autocorrelation, together with the fact that most of the bars lie between the blue lines with a few exceptions, could indicate stationarity. However, the KPSS test above gave a p-value of 0.01 < 0.05, which indicates non-stationarity for GLOB.

LT:

The ACF of LT resembles that of GLOB, showing a slow decay. This slow decay suggests non-stationarity in the data, meaning that the lags are highly correlated with their preceding values in the short term. The LT variable has a strong short-term dependency on its immediate past value, which is visible in the PACF plot. Beyond lag 1, it does not exhibit any strong dependencies, with only a few lags outside the confidence interval. Here we could assume some stationarity in the data.

From the long-term plot of the ACF, the sinusoidal shape — as we also had for GLOB — suggests a strong seasonal component. The PACF decays fast and the bars mostly stay inside the confidence interval, which could lead us to think that the LT series is stationary. The KPSS tells us otherwise; it gave a p-value < 0.05, which indicates non-stationarity for LT. This mismatch could have something to do with trend and seasonal components. From the plot of the LT series above, there was a slight dip each year in the winter season, which could strengthen our suspicion of a trend.

Forretning:

The ACF plot suggests that the Forretning series has both short-term correlation with recent values and a periodic component. The plot decays a bit but rises quickly again and repeats, which could indicate the periodicity. Since the ACF does not quickly drop to zero, it suggests non-stationarity. It also looks like the period the ACF catches is weekly, since the autocorrelation is at its highest every 7th day. In the PACF, lags 1, 3, 5, 6, 7 and 8 look to be influenced by values a few days back.

In the long term the ACF plot shows cyclical behavior with the same sinusoidal pattern as the previous variables. It also decays slowly which is typical for non-stationary series, and the KPSS also suggested non-stationarity. The small spikes in the pattern could be noise or some irregular pattern, or some what more complex seasonality; possible monthly or weekly seasonality. PACF show significant partial autocorrelation in the short term with its initial lags. Past these initial lags, where it dampens, all the lags stays within the confidence interval. Could have some stationarity but the KPSS tells us otherwise.

Industri:

The ACF and PACF in the short term for Industri, resembles the plots for Forretning. In the cross-correlation matrix, these two variables had a very high correlation at around 0.96. For the long term it is also very close to Forretning.

Privat:

The ACF of Privat is a bit different from the two previous ones. The lags decay really slowly and show high dependence on the previous days in the short term. The PACF has a strong initial lag, indicating that the immediately preceding value has a strong influence on the current value. Beyond lag 1, the lags have small partial autocorrelation, mostly inside the confidence interval, with a few just outside.

In the long term, the ACF shows the same sinusoidal pattern as the other variables. In contrast to the spikes that clearly showed in Forretning and Industri, this plot seems smoother, which could indicate that there is only one seasonal component. It is also decaying, suggesting non-stationarity. The PACF has one strong initial lag and then drops quickly, keeping most of the lags within the confidence interval. The KPSS suggested non-stationarity, even though we might find some stationarity from the PACF — but not from the ACF.

Task E

# Remove the annual cycle with a lag-365 seasonal difference, then drop the
# leading rows where the difference is undefined (the first 365 days).
seasonally_differenced_data <- merged_tsibble %>%
  mutate(across(c(GLOB, LT, Forretning, Industri, Privat),
                ~ difference(.x, lag = 365))) %>%
  filter(!is.na(rowSums(across(c(GLOB, LT, Forretning, Industri, Privat)))))
# Re-run the correlograms on the differenced series for comparison.
for (col in columns_to_analyze) {
  plot_acf_pacf_short(seasonally_differenced_data, col)
  plot_acf_pacf_long(seasonally_differenced_data, col)
}
## Plotting ACF and PACF for GLOB short-term (14 days)

## Plotting ACF and PACF for GLOB long-term (2 years)

## Plotting ACF and PACF for LT short-term (14 days)

## Plotting ACF and PACF for LT long-term (2 years)

## Plotting ACF and PACF for Forretning short-term (14 days)

## Plotting ACF and PACF for Forretning long-term (2 years)

## Plotting ACF and PACF for Industri short-term (14 days)

## Plotting ACF and PACF for Industri long-term (2 years)

## Plotting ACF and PACF for Privat short-term (14 days)

## Plotting ACF and PACF for Privat long-term (2 years)

Discussion and Comparison of the Correlation Analysis

GLOB:

The bars in the ACF plot of GLOB decay quickly and stay between the blue confidence-interval lines. This indicates a short-term correlation with the immediately preceding values. Compared to the original data, this seems more stationary in the short term and less correlated after a few lags. The PACF drops off a bit faster after the differencing and stays within the confidence interval, suggesting stationarity. For the long term, the first lags give high correlation, as expected from all the previous plots in this analysis. There are a few lags outside the confidence interval in the beginning, and also around lag 365, which corresponds to a year. This tells us that the GLOB values are somewhat correlated with the values one year back. In comparison to the original data, we no longer have the same seasonal component with the sinusoidal pattern. The PACF gives results quite similar to those of the ACF.

LT:

In the ACF plot of LT over the short term, it decays smoothly and approaches zero around lag 10-11. In the original data, it decayed much slower. The PACF also drops close to zero at most lags, while still having a strong correlation at lag 1. In the long term we don’t have the same sinusoidal pattern we had before differencing. A few lags in the beginning are highly autocorrelated, and a few around one year. There are also some bars outside of the confidence interval just after 100 lags. These could be explained by seasonal differences in temperature. The PACF drops down to around zero right away and seems to be mostly steady within the confidence interval. The plot suggests stationarity.

Forretning:

The first bar in the ACF is close to one, and the next two highest bars are at lags 7 and 14. This could indicate a weekly period of high autocorrelation. Lag 8 stands out in both this PACF plot and the one for the original data, which could point to some seasonal component. For the long term, the ACF plot looks like a dampened seasonal pattern; the series should probably be differenced further. The PACF has a few long bars in the beginning, both positive and negative, before it quickly drops down within the confidence interval. There is some resemblance between the long-term plots for Forretning before and after differencing. The ACF plot exhibits more stationarity now than before, even though it still looks non-stationary.

Industri:

The ACF and PACF for Industri are almost identical to the ones for Forretning, with the same lags standing out. For the long-term ACF, there is not as much of a seasonal pattern as was exhibited in Forretning, but there is still a clear resemblance. The PACF is also quite close to Forretning's PACF.

Privat:

Instead of having almost no decay, this time the ACF plot of Privat gives a slow but decaying ACF plot. In short term, it clearly seems to be non-stationary. ACF long term looks to have some seasonal pattern. The PACF looks good, mostly staying within the confidence interval after the first lag with high partial autocorrelation.

Overall, the last three variables could use one more differencing.

Task F

Was struggling with the stl. We forgot to think of the frequency.

# Convert every numeric column of a data frame into a `ts` object with the
# given start and frequency; non-numeric columns (e.g. a Date column) are
# passed through unchanged. Returns a data.frame with the converted columns.
convert_to_ts <- function(data, start_year = 2021, start_day = 1, frequency = 365) {
  # Coerce to a plain data.frame so column iteration is predictable.
  data <- as.data.frame(data)

  # Numeric columns -> ts; everything else is kept as-is.
  converted <- lapply(data, function(column) {
    if (is.numeric(column)) {
      ts(column, frequency = frequency, start = c(start_year, start_day))
    } else {
      column
    }
  })

  # Reassemble the per-column results into a data frame and return it.
  as.data.frame(converted)
}
merged_data_ts <- convert_to_ts(merged_data_clean)
# Run an STL decomposition on every numeric column of `data` except the
# date column. Returns a named list of `stl` objects, one per column.
perform_stl_on_all_columns <- function(data, date_column = "DATO", frequency = 365) {
  # Work on a plain data.frame copy.
  data <- as.data.frame(data)

  stl_results <- list()
  for (col_name in names(data)) {
    # Skip the date column and any non-numeric columns.
    if (col_name == date_column || !is.numeric(data[[col_name]])) {
      next
    }

    # Build a ts at the requested frequency, then decompose it with a
    # periodic seasonal window (fixed seasonal shape across cycles).
    series <- ts(data[[col_name]], frequency = frequency)
    stl_results[[col_name]] <- stl(series, s.window = "periodic")
  }

  # Named list of STL decomposition results.
  stl_results
}

stl_decompositions <- perform_stl_on_all_columns(merged_data_clean)

“periodic” is ideal for capturing annual seasonality due to the consistent cycle length.

# Plot each STL decomposition and collect its seasonal, trend, and
# remainder components into a named list (one entry per variable).
plot_and_extract_stl_components <- function(stl_decompositions) {
  components <- list()
  for (col_name in names(stl_decompositions)) {
    stl_result <- stl_decompositions[[col_name]]

    # Show the decomposition for visual inspection.
    cat("Plotting STL decomposition for:", col_name, "\n")
    plot(stl_result, main = paste("STL Decomposition for", col_name))

    # Pull the three component series out of the fitted object and store
    # them under the variable's name.
    ts_mat <- stl_result$time.series
    components[[col_name]] <- list(seasonal = ts_mat[, "seasonal"],
                                   trend = ts_mat[, "trend"],
                                   remainder = ts_mat[, "remainder"])
  }
  components
}

list_comps <- plot_and_extract_stl_components(stl_decompositions)
## Plotting STL decomposition for: LT

## Plotting STL decomposition for: GLOB

## Plotting STL decomposition for: Forretning

## Plotting STL decomposition for: Industri

## Plotting STL decomposition for: Privat

# Split each variable's STL output into its seasonal part and its
# de-seasoned part (trend + remainder). lapply preserves the order and
# names of list_comps, so the resulting columns match the original loop.
seasonal_data <- lapply(list_comps, function(comp) comp$seasonal)
trend_remainder_data <- lapply(list_comps, function(comp) comp$trend + comp$remainder)

# One column per variable.
seasonal_df <- as.data.frame(seasonal_data)
trend_remainder_df <- as.data.frame(trend_remainder_data)

# Attach the matching dates so the rows stay interpretable.
dates <- merged_data_ts$DATO
seasonal_df$Date <- dates
trend_remainder_df$Date <- dates

# Quick sanity check of both data frames.
head(seasonal_df)
head(trend_remainder_df)

Task G

Granger Causality Test

Null Hypothesis (H0): X does not Granger-cause Y, meaning past values of X do not provide additional predictive information for Y over Y’s own past values.

Alternative Hypothesis (H1): X Granger-causes Y, indicating past values of X help improve predictions of Y beyond using only Y’s own past values.

An example of Granger causality: we cannot claim that the weather is hot because of high ice cream sales, but we can use hot weather to help predict high ice cream sales.

The Granger causality test is performed by comparing two types of models: a restricted model (using only the target variable’s past values) and an unrestricted model (including both the target and predictor variable’s past values). These models rely on autoregressive (AR) and vector autoregressive (VAR) frameworks.

Autoregressive (AR) Models: For a single time series, AR models use past values to predict the current value.

Vector Autoregressive (VAR) Models: For multiple time series, VAR models consider past values of each time series in the system to predict current values of each series.

# Keep a tsibble version for reference (uses `Date` as the index).
ts_trend_reminder <- as_tsibble(trend_remainder_df)

# Build the numeric matrices used by VARselect/VAR/causality below.
# The date columns must be excluded here: the original code passed them
# straight into ts(), so `DATO`/`Date` appeared as endogenous variables
# in the Granger tests (visible in the printed output), which is
# meaningless and distorts both lag selection and the F-tests.
deseasoned_df <- as.data.frame(trend_remainder_df)
deseasoned_data_ts <- ts(deseasoned_df[setdiff(names(deseasoned_df), c("Date", "DATO"))],
                         frequency = 365)

original_df <- as.data.frame(merged_tsibble)
original_data_ts <- ts(original_df[setdiff(names(original_df), c("Date", "DATO"))],
                       frequency = 365)
library(vars)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
## 
##     index
## The following object is masked from 'package:imputeTS':
## 
##     na.locf
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
## Loading required package: urca
## Loading required package: lmtest
# Select the VAR lag order by AIC (searching up to 15 lags) for both the
# original and the deseasoned data sets.
# For original data
lag_selection_orig <- VARselect(original_data_ts, lag.max = 15, type = "const")
lag_order_orig <- lag_selection_orig$selection["AIC(n)"] 

# For deseasoned data
lag_selection_deseason <- VARselect(deseasoned_data_ts, lag.max = 15, type = "const")
lag_order_deseason <- lag_selection_deseason$selection["AIC(n)"]
# Fit one VAR per data set at its AIC-selected lag order.
var_model_orig <- VAR(original_data_ts, p = lag_order_orig, type = "const")
var_model_deseason <- VAR(deseasoned_data_ts, p = lag_order_deseason, type = "const")
# LT: does temperature Granger-cause the remaining series (original data)?
granger_test_orig <- causality(var_model_orig, cause = "LT") 
print(granger_test_orig)
## $Granger
## 
##  Granger causality H0: LT do not Granger-cause DATO GLOB Forretning
##  Industri Privat
## 
## data:  VAR object var_model_orig
## F-Test = 1.2203, df1 = 75, df2 = 6852, p-value = 0.09585
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: LT and DATO GLOB Forretning
##  Industri Privat
## 
## data:  VAR object var_model_orig
## Chi-squared = 60.643, df = 5, p-value = 8.951e-12
# Same Granger test for LT, on the deseasoned data.
granger_test_deseason <- causality(var_model_deseason, cause = "LT") 
print(granger_test_deseason)
## $Granger
## 
##  Granger causality H0: LT do not Granger-cause GLOB Forretning Industri
##  Privat Date
## 
## data:  VAR object var_model_deseason
## F-Test = 1.4506, df1 = 70, df2 = 6894, p-value = 0.008555
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: LT and GLOB Forretning Industri
##  Privat Date
## 
## data:  VAR object var_model_deseason
## Chi-squared = 60.006, df = 5, p-value = 1.212e-11

Granger causality H0: LT do not Granger-cause GLOB Forretning Industri Privat. Original Data: p-value = 0.096 > 0.05, so we fail to reject H0 at the 5% level — on the original data, past values of LT do not add statistically significant predictive information for the other variables (note that DATO was included as an endogenous variable here, which weakens this test). Deseasoned Data: p-value = 0.0086 < 0.05, and we reject H0. This indicates that past values of LT provide significant information that helps predict the other variables once seasonality is removed.

# GLOB: does global irradiation Granger-cause the remaining series?
granger_test_orig <- causality(var_model_orig, cause = "GLOB") 
print(granger_test_orig)
## $Granger
## 
##  Granger causality H0: GLOB do not Granger-cause DATO LT Forretning
##  Industri Privat
## 
## data:  VAR object var_model_orig
## F-Test = 1.5996, df1 = 75, df2 = 6852, p-value = 0.0008204
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: GLOB and DATO LT Forretning
##  Industri Privat
## 
## data:  VAR object var_model_orig
## Chi-squared = 24.542, df = 5, p-value = 0.0001707
# Same Granger test for GLOB, on the deseasoned data.
granger_test_deseason <- causality(var_model_deseason, cause = "GLOB") 
print(granger_test_deseason)
## $Granger
## 
##  Granger causality H0: GLOB do not Granger-cause LT Forretning Industri
##  Privat Date
## 
## data:  VAR object var_model_deseason
## F-Test = 1.5939, df1 = 70, df2 = 6894, p-value = 0.001254
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: GLOB and LT Forretning Industri
##  Privat Date
## 
## data:  VAR object var_model_deseason
## Chi-squared = 30.67, df = 5, p-value = 1.088e-05

Granger causality H0: GLOB do not Granger-cause LT Forretning Industri Privat. Original Data: p-value << 0.05, and we reject the H0. This indicates that past values of GLOB provide significant information that helps predict the other variables. Deseasoned Data: p-value < 0.05, and we reject the H0. This indicates that past values of GLOB provide significant information that helps predict the other variables.

# Forretning: does business consumption Granger-cause the remaining series?
granger_test_orig <- causality(var_model_orig, cause = "Forretning") 
print(granger_test_orig)
## $Granger
## 
##  Granger causality H0: Forretning do not Granger-cause DATO LT GLOB
##  Industri Privat
## 
## data:  VAR object var_model_orig
## F-Test = 5.1909, df1 = 75, df2 = 6852, p-value < 2.2e-16
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: Forretning and DATO LT GLOB
##  Industri Privat
## 
## data:  VAR object var_model_orig
## Chi-squared = 508.95, df = 5, p-value < 2.2e-16
# Same Granger test for Forretning, on the deseasoned data.
granger_test_deseason <- causality(var_model_deseason, cause = "Forretning") 
print(granger_test_deseason)
## $Granger
## 
##  Granger causality H0: Forretning do not Granger-cause LT GLOB Industri
##  Privat Date
## 
## data:  VAR object var_model_deseason
## F-Test = 6.242, df1 = 70, df2 = 6894, p-value < 2.2e-16
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: Forretning and LT GLOB Industri
##  Privat Date
## 
## data:  VAR object var_model_deseason
## Chi-squared = 495.85, df = 5, p-value < 2.2e-16

Granger causality H0: Forretning do not Granger-cause LT GLOB Industri Privat. Original Data: p-value << 0.05, and we reject the H0. This indicates that past values of Forretning provide significant information that helps predict the other variables. Deseasoned Data: p-value << 0.05, and we reject the H0. This indicates that past values of Forretning provide significant information that helps predict the other variables.

# Industri: does industrial consumption Granger-cause the remaining series?
granger_test_orig <- causality(var_model_orig, cause = "Industri") 
print(granger_test_orig)
## $Granger
## 
##  Granger causality H0: Industri do not Granger-cause DATO LT GLOB
##  Forretning Privat
## 
## data:  VAR object var_model_orig
## F-Test = 6.373, df1 = 75, df2 = 6852, p-value < 2.2e-16
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: Industri and DATO LT GLOB
##  Forretning Privat
## 
## data:  VAR object var_model_orig
## Chi-squared = 463.98, df = 5, p-value < 2.2e-16
# Same Granger test for Industri, on the deseasoned data.
granger_test_deseason <- causality(var_model_deseason, cause = "Industri") 
print(granger_test_deseason)
## $Granger
## 
##  Granger causality H0: Industri do not Granger-cause LT GLOB Forretning
##  Privat Date
## 
## data:  VAR object var_model_deseason
## F-Test = 7.6282, df1 = 70, df2 = 6894, p-value < 2.2e-16
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: Industri and LT GLOB Forretning
##  Privat Date
## 
## data:  VAR object var_model_deseason
## Chi-squared = 444.47, df = 5, p-value < 2.2e-16

Granger causality H0: Industri do not Granger-cause LT GLOB Forretning Privat. Original Data: p-value << 0.05, and we reject the H0. This indicates that past values of Industri provide significant information that helps predict the other variables. Deseasoned Data: p-value << 0.05, and we reject the H0. This indicates that past values of Industri provide significant information that helps predict the other variables.

# Privat: does household consumption Granger-cause the remaining series?
granger_test_orig <- causality(var_model_orig, cause = "Privat") 
print(granger_test_orig)
## $Granger
## 
##  Granger causality H0: Privat do not Granger-cause DATO LT GLOB
##  Forretning Industri
## 
## data:  VAR object var_model_orig
## F-Test = 8.968, df1 = 75, df2 = 6852, p-value < 2.2e-16
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: Privat and DATO LT GLOB
##  Forretning Industri
## 
## data:  VAR object var_model_orig
## Chi-squared = 369.22, df = 5, p-value < 2.2e-16
# Same Granger test for Privat, on the deseasoned data.
granger_test_deseason <- causality(var_model_deseason, cause = "Privat") 
print(granger_test_deseason)
## $Granger
## 
##  Granger causality H0: Privat do not Granger-cause LT GLOB Forretning
##  Industri Date
## 
## data:  VAR object var_model_deseason
## F-Test = 11.082, df1 = 70, df2 = 6894, p-value < 2.2e-16
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: Privat and LT GLOB Forretning
##  Industri Date
## 
## data:  VAR object var_model_deseason
## Chi-squared = 368.06, df = 5, p-value < 2.2e-16

Granger causality H0: Privat do not Granger-cause LT GLOB Forretning Industri. Original Data: p-value << 0.05, and we reject the H0. This indicates that past values of Privat provide significant information that helps predict the other variables. Deseasoned Data: p-value << 0.05, and we reject the H0. This indicates that past values of Privat provide significant information that helps predict the other variables.

The consistency of results between the original and deseasoned data suggests that the predictive relationship is not only due to seasonal components, assuming the deseasoning has been done correctly.

We cannot conclude that Granger causality implies true causality. The Granger causality test checks whether one variable improves the prediction of another based on past values (backshifting). This does not directly mean that one variable causes changes in the other, and it does not take into account potential third factors that may influence both variables. What we can say is that Granger causality suggests a predictive relationship — predictive causation, but not true causation.

Task H

library(forecast)
# Root-mean-square error between a forecast and the observed values.
rmse <- function(predicted, actual) {
  errors <- predicted - actual
  sqrt(mean(errors^2))
}
# Function given in the assignment description (cleaned up).
#
# Rolling-origin cross-validation for a univariate series:
#   - folds start at `init_fold` and advance by `h` observations;
#   - each fold fits `model_fct` on all data before the fold start and
#     evaluates an h-step forecast on the next `h` observations;
#   - returns the per-fold RMSE (and optionally the fitted models).
#
# NOTE: the original version accumulated errors in a local called `rmse`
# and stored the forecast in a local called `forecast`, shadowing the
# global `rmse()` helper and `forecast::forecast()`. That only worked
# because R skips non-function bindings when resolving a call; the locals
# are renamed here (and preallocated) to keep the code unambiguous.
CV.2 <- function(data, model_fct, init_fold = 1096 - 90 * 5, h = 90, return_models = FALSE, ...) {

  fold_inds <- seq(init_fold, length(data) - h, by = h)
  fold_rmse <- numeric(length(fold_inds))  # one RMSE per fold
  models <- vector("list", length(fold_inds))

  for (i in seq_along(fold_inds)) {

    fold <- fold_inds[i]

    # Train on everything before the fold start; test on the next h points.
    train <- data[1:(fold - 1)]
    test <- data[fold:(fold + h - 1)]

    new_model <- model_fct(train, ...)
    models[[i]] <- new_model
    fc <- forecast(new_model, h = h)
    fold_rmse[i] <- rmse(fc$mean, test)
  }

  if (return_models) {
    return(list(rmse = fold_rmse, model = models))
  } else {
    return(fold_rmse)
  }
}
# Cross-validate auto.arima on the Privat consumption series.
forecast_Privat <- CV.2(merged_data_ts$Privat, model_fct = auto.arima, return_models = TRUE)
print(forecast_Privat)
## $rmse
## [1] 132709.2 134839.4 127252.0 124381.1 118109.4 126121.2
## 
## $model
## $model[[1]]
## Series: train 
## ARIMA(0,1,3) 
## 
## Coefficients:
##          ma1      ma2      ma3
##       0.1671  -0.1609  -0.1134
## s.e.  0.0392   0.0403   0.0424
## 
## sigma^2 = 342282555:  log likelihood = -7240.02
## AIC=14488.04   AICc=14488.1   BIC=14505.91
## 
## $model[[2]]
## Series: train 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1      ma2
##       0.7144  -0.6325  -0.1835
## s.e.  0.0741   0.0776   0.0370
## 
## sigma^2 = 388116121:  log likelihood = -8298.15
## AIC=16604.3   AICc=16604.35   BIC=16622.69
## 
## $model[[3]]
## Series: train 
## ARIMA(0,1,3) 
## 
## Coefficients:
##          ma1     ma2      ma3
##       0.0948  -0.116  -0.1287
## s.e.  0.0347   0.035   0.0375
## 
## sigma^2 = 352241530:  log likelihood = -9275.83
## AIC=18559.66   AICc=18559.71   BIC=18578.52
## 
## $model[[4]]
## Series: train 
## ARIMA(0,1,3) 
## 
## Coefficients:
##          ma1      ma2      ma3
##       0.0976  -0.1222  -0.1251
## s.e.  0.0330   0.0328   0.0351
## 
## sigma^2 = 343342749:  log likelihood = -10277.44
## AIC=20562.87   AICc=20562.91   BIC=20582.14
## 
## $model[[5]]
## Series: train 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1      ma2
##       0.7336  -0.5838  -0.2310
## s.e.  0.0592   0.0614   0.0298
## 
## sigma^2 = 413128774:  log likelihood = -11382.49
## AIC=22772.98   AICc=22773.02   BIC=22792.63
## 
## $model[[6]]
## Series: train 
## ARIMA(1,1,3) 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3
##       0.7009  -0.5360  -0.2070  -0.0558
## s.e.  0.0695   0.0757   0.0323   0.0347
## 
## sigma^2 = 430798405:  log likelihood = -12425.38
## AIC=24860.77   AICc=24860.82   BIC=24885.76
# Cross-validate auto.arima on the air-temperature series (LT).
forecast_LT <- CV.2(merged_data_ts$LT, model_fct = auto.arima, return_models = TRUE)
print(forecast_LT)
## $rmse
## [1] 0.04906244 0.04468151 0.04468719 8.51650670 5.08579679 8.97213235
## 
## $model
## $model[[1]]
## Series: train 
## ARIMA(1,1,3) 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3
##       0.7790  -0.8839  -0.1336  0.1150
## s.e.  0.0923   0.1014   0.0532  0.0515
## 
## sigma^2 = 4.353:  log likelihood = -1385.53
## AIC=2781.06   AICc=2781.16   BIC=2803.4
## 
## $model[[2]]
## Series: train 
## ARIMA(1,1,3) 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3
##       0.7789  -0.8839  -0.1337  0.1150
## s.e.  0.0864   0.0949   0.0498  0.0482
## 
## sigma^2 = 3.816:  log likelihood = -1531.14
## AIC=3072.28   AICc=3072.36   BIC=3095.27
## 
## $model[[3]]
## Series: train 
## ARIMA(1,1,3) 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3
##       0.7789  -0.8839  -0.1337  0.1150
## s.e.  0.0815   0.0895   0.0470  0.0455
## 
## sigma^2 = 3.398:  log likelihood = -1671.22
## AIC=3352.43   AICc=3352.5   BIC=3376
## 
## $model[[4]]
## Series: train 
## ARIMA(1,1,3) 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3
##       0.7789  -0.8839  -0.1338  0.1150
## s.e.  0.0774   0.0850   0.0447  0.0432
## 
## sigma^2 = 3.062:  log likelihood = -1806.37
## AIC=3622.73   AICc=3622.8   BIC=3646.82
## 
## $model[[5]]
## Series: train 
## ARIMA(1,1,3) 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3
##       0.8192  -0.8774  -0.1437  0.1014
## s.e.  0.0663   0.0744   0.0413  0.0393
## 
## sigma^2 = 3.483:  log likelihood = -2049.16
## AIC=4108.33   AICc=4108.39   BIC=4132.89
## 
## $model[[6]]
## Series: train 
## ARIMA(1,1,3) 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3
##       0.7834  -0.8214  -0.1660  0.0801
## s.e.  0.0618   0.0697   0.0384  0.0386
## 
## sigma^2 = 3.562:  log likelihood = -2245.26
## AIC=4500.52   AICc=4500.57   BIC=4525.5
# Cross-validate auto.arima on the global-irradiation series (GLOB).
forecast_GLOB <- CV.2(merged_data_ts$GLOB, model_fct = auto.arima, return_models = TRUE)
print(forecast_GLOB)
## $rmse
## [1] 1.068597 1.143652 1.143603 1.315017 7.816510 9.783908
## 
## $model
## $model[[1]]
## Series: train 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3243  -0.8748
## s.e.  0.0461   0.0217
## 
## sigma^2 = 20.18:  log likelihood = -1880.72
## AIC=3767.44   AICc=3767.48   BIC=3780.85
## 
## $model[[2]]
## Series: train 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3243  -0.8748
## s.e.  0.0431   0.0203
## 
## sigma^2 = 17.7:  log likelihood = -2095.51
## AIC=4197.02   AICc=4197.06   BIC=4210.82
## 
## $model[[3]]
## Series: train 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3242  -0.8748
## s.e.  0.0407   0.0192
## 
## sigma^2 = 15.76:  log likelihood = -2304.78
## AIC=4615.56   AICc=4615.59   BIC=4629.7
## 
## $model[[4]]
## Series: train 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3242  -0.8747
## s.e.  0.0387   0.0182
## 
## sigma^2 = 14.21:  log likelihood = -2509.13
## AIC=5024.26   AICc=5024.29   BIC=5038.71
## 
## $model[[5]]
## Series: train 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3233  -0.8746
## s.e.  0.0369   0.0174
## 
## sigma^2 = 12.96:  log likelihood = -2710.11
## AIC=5426.21   AICc=5426.24   BIC=5440.95
## 
## $model[[6]]
## Series: train 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3121  -0.8693
## s.e.  0.0359   0.0174
## 
## sigma^2 = 13.18:  log likelihood = -2962.35
## AIC=5930.69   AICc=5930.72   BIC=5945.69
# Average cross-validated RMSE per weather variable.
rmse_results <- data.frame(
  Variable = c("LT", "GLOB"),
  RMSE = c(mean(forecast_LT$rmse), mean(forecast_GLOB$rmse))
)

# Bar chart of the average RMSE values; geom_col() is the documented
# shorthand for geom_bar(stat = "identity") and draws the same plot.
ggplot(rmse_results, aes(x = Variable, y = RMSE)) +
  geom_col() +
  theme_minimal() +
  labs(title = "Average RMSE for LT and GLOB", y = "Average RMSE", x = "Variable")

# Average cross-validated RMSE for the Privat series, plotted separately
# because its scale dwarfs LT and GLOB.
rmse_resultsP <- data.frame(
  Variable = c("Privat"),
  RMSE = c(mean(forecast_Privat$rmse))
)

# geom_col() draws the same bars as geom_bar(stat = "identity").
ggplot(rmse_resultsP, aes(x = Variable, y = RMSE)) +
  geom_col() +
  theme_minimal() +
  labs(title = "Average RMSE for Privat", y = "Average RMSE", x = "")

The RMSE values for “Privat” are significantly higher compared to “GLOB” and “LT.” This large discrepancy in scale comes from the huge difference in the scaling of the data itself.

Discuss the limitation of forecasting on datasets with two years of data when there is a yearly seasonal period:

With just two years of data, there are only two full cycles of the yearly seasonality. This is generally insufficient to capture the variability of seasonal patterns over time. For instance, weather, electricity consumption, or other factors with annual seasonality can be influenced by broader, multi-year cycles or irregularities that may not be visible in such a short dataset. And in many time series, long-term trends coexist with seasonal patterns. However, identifying and separating these trends reliably from the seasonality is challenging with only two years of data. This can lead to inaccurate forecasts.

# Ensure the date column is a proper Date before building the series.
all_data_imputed$DATO <- as.Date(all_data_imputed$DATO)

# Convert the data to time series format
lt_ts <- ts(all_data_imputed$LT, start = c(2017, 1), frequency = 365)    # Daily data for LT
glob_ts <- ts(all_data_imputed$GLOB, start = c(2017, 1), frequency = 365)  # Daily data for GLOB

# Forecast LT (Air Temperature)
# NOTE(review): with frequency = 365, auto.arima fits a non-seasonal
# model, so the yearly cycle is not modelled explicitly here — confirm
# whether a seasonal approach (e.g. STL + ARIMA) was intended.
lt_model <- auto.arima(lt_ts)
lt_forecast <- forecast(lt_model, h = 365)  # Forecast for the next year (365 days)

# Forecast GLOB (Global Irradiation)
glob_model <- auto.arima(glob_ts)
glob_forecast <- forecast(glob_model, h = 365)  # Forecast for the next year (365 days)

# Plot the one-year-ahead forecast for each series.
autoplot(lt_forecast) + 
  labs(title = "Forecast of Air Temperature (LT) for the Next Year",
       x = "Date", y = "Air Temperature (LT)")

autoplot(glob_forecast) + 
  labs(title = "Forecast of Global Irradiation (GLOB) for the Next Year",
       x = "Date", y = "Global Irradiation (GLOB)")

Briefly explain the difference between using cross-validation on time series data and non-time series data:

In cross-validation for non-time series data, data is randomly split into folds because observations are independent. In contrast, for time series data, cross-validation must respect the temporal order, ensuring that training is done on past data to predict future data, as future values cannot be used for training. This prevents data leakage and maintains the temporal dependency in the model.

# Shapiro-Wilk normality test on the LT model's residuals.
lt_residuals <- residuals(lt_model)
shapiro_lt <- shapiro.test(lt_residuals)
print(shapiro_lt)
## 
##  Shapiro-Wilk normality test
## 
## data:  lt_residuals
## W = 0.96945, p-value < 2.2e-16
plot(lt_residuals)

The residuals from our ARIMA model show significant non-normality (as indicated by the Shapiro-Wilk test) and some large spikes, suggesting potential outliers. While the variance seems fairly stable over time, the model may not fully capture all patterns in the data, possibly missing seasonal or autocorrelated components.

# Shapiro-Wilk normality test on the GLOB model's residuals.
glob_residuals <- residuals(glob_model)
shapiro_glob <- shapiro.test(glob_residuals)
print(shapiro_glob)
## 
##  Shapiro-Wilk normality test
## 
## data:  glob_residuals
## W = 0.907, p-value < 2.2e-16
plot(glob_residuals)

The residual plot shows a recurring pattern with varying amplitudes over time, suggesting seasonality or structural shifts in the data. This pattern could also indicate that the model is not fully capturing the underlying seasonality or other cyclic behavior in the data.

Part 3

Task I

library(forecast)
library(ggplot2)

# Convert data to time series format
lt_ts <- ts(all_data_imputed$LT, start = c(2017, 1), frequency = 365)    # Daily data for LT (Air Temperature)
glob_ts <- ts(all_data_imputed$GLOB, start = c(2017, 1), frequency = 365)  # Daily data for GLOB (Global Irradiation)

# Fit an ARIMAX model to forecast GLOB using LT as an exogenous variable
# (a linear regression on LT with ARIMA errors, per auto.arima's xreg).
glob_arimax_model <- auto.arima(glob_ts, xreg = lt_ts)

# Summarize the ARIMAX model for GLOB
summary(glob_arimax_model)
## Series: glob_ts 
## Regression with ARIMA(5,0,0) errors 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4     ar5  intercept   xreg
##       0.4237  0.0993  0.1282  0.1135  0.1386     9.0116  0.190
## s.e.  0.0217  0.0235  0.0233  0.0234  0.0217     1.0955  0.044
## 
## sigma^2 = 22.36:  log likelihood = -6275.43
## AIC=12566.85   AICc=12566.92   BIC=12612.1
## 
## Training set error measures:
##                       ME     RMSE     MAE       MPE    MAPE      MASE
## Training set 0.000368663 4.721149 3.43121 -65.61255 89.7397 0.7363509
##                     ACF1
## Training set -0.01887873
# Forecast for the next 365 days using the ARIMAX model for GLOB.
# The exogenous regressor must be supplied over the forecast horizon,
# so LT is forecast first and those values are fed in as xreg.
lt_forecast <- forecast(auto.arima(lt_ts), h = 365)  # Forecast future LT values
future_lt <- lt_forecast$mean  # Forecasted LT values to use as exogenous input

# Perform ARIMAX forecast on GLOB with future LT values
glob_forecast_arimax <- forecast(glob_arimax_model, xreg = future_lt, h = 365)

# Plot the ARIMAX forecast for GLOB
autoplot(glob_forecast_arimax) +
  labs(title = "ARIMAX Forecast of Global Irradiation (GLOB) using Air Temperature (LT) as Exogenous Variable",
       x = "Date", y = "Global Irradiation (GLOB)")

# Fit an ARIMAX model to forecast LT using GLOB as an exogenous variable
# (the mirror of the GLOB-on-LT model above).
lt_arimax_model <- auto.arima(lt_ts, xreg = glob_ts)

# Summarize the ARIMAX model for LT
summary(lt_arimax_model)
## Series: lt_ts 
## Regression with ARIMA(5,0,0) errors 
## 
## Coefficients:
##          ar1      ar2     ar3     ar4     ar5  intercept    xreg
##       0.9587  -0.1480  0.0915  0.0141  0.0514     6.5302  0.0564
## s.e.  0.0218   0.0304  0.0304  0.0302  0.0218     1.4675  0.0090
## 
## sigma^2 = 4.898:  log likelihood = -4672.32
## AIC=9360.64   AICc=9360.71   BIC=9405.88
## 
## Training set error measures:
##                        ME   RMSE      MAE       MPE     MAPE      MASE
## Training set 0.0002582396 2.2094 1.649087 -112.9453 181.8112 0.4168356
##                      ACF1
## Training set -0.002315457
# Forecast for the next 365 days using the ARIMAX model for LT.
# Future GLOB values are needed as the exogenous input over the horizon.
glob_forecast <- forecast(auto.arima(glob_ts), h = 365)  # Forecast future GLOB values
future_glob <- glob_forecast$mean  # Forecasted GLOB values to use as exogenous input

# Perform ARIMAX forecast on LT with future GLOB values
lt_forecast_arimax <- forecast(lt_arimax_model, xreg = future_glob, h = 365)

# Plot the ARIMAX forecast for LT
autoplot(lt_forecast_arimax) +
  labs(title = "ARIMAX Forecast of Air Temperature (LT) using Global Irradiation (GLOB) as Exogenous Variable",
       x = "Date", y = "Air Temperature (LT)")

Detrended Arimax Using LT as Exogenous Variable

# Detrend LT by REMOVING the STL trend: detrended = data - trend
# (i.e. seasonal + remainder). The original code took
# stl(...)$time.series[, 2], which is the trend component itself, so the
# model was actually regressing GLOB on the smooth LT trend, not on
# detrended LT as the text claims.
lt_stl <- stl(lt_ts, s.window = "periodic")
lt_ts_detrended <- lt_ts - lt_stl$time.series[, "trend"]

# Fit ARIMAX model with detrended LT as exogenous variable
glob_arimax_model_detrended <- auto.arima(glob_ts, xreg = lt_ts_detrended)

# Forecast the SAME regressor that was used in training: the model was
# fit on detrended LT, so the future xreg must also be detrended LT.
# (The original fed in a forecast of raw LT — a scale mismatch.)
lt_forecast_detrended <- forecast(auto.arima(lt_ts_detrended), h = 365)
future_lt_detrended <- lt_forecast_detrended$mean  # Forecasted detrended LT values

# Forecast GLOB using the forecasted future detrended LT values
glob_forecast_arimax_detrended <- forecast(glob_arimax_model_detrended, xreg = future_lt_detrended, h = 365)

autoplot(glob_forecast_arimax_detrended) +
  labs(title = "ARIMAX Forecast of Global Irradiation (GLOB) using Detrended Air Temperature (LT) as Exogenous Variable",
       x = "Date", y = "Global Irradiation (GLOB)")

Detrended ARIMAX for Both Series

# Detrended ARIMAX for both plots: remove the STL trend from LT
# (detrended = data - trend). The original [, 2] selected the trend
# component itself rather than detrending the series.
lt_stl_fit <- stl(lt_ts, s.window = "periodic")
lt_ts_detrended <- lt_ts - lt_stl_fit$time.series[, "trend"]

glob_arimax_model_detrended <- auto.arima(glob_ts, xreg = lt_ts_detrended)

# Forecast the detrended regressor itself so the future xreg matches the
# series the model was trained on.
lt_forecast_detrended <- forecast(auto.arima(lt_ts_detrended), h = 365)
future_lt_detrended <- lt_forecast_detrended$mean

glob_forecast_arimax_detrended <- forecast(glob_arimax_model_detrended, xreg = future_lt_detrended, h = 365)

lt_forecast_plot <- autoplot(lt_forecast_detrended) +
  labs(title = "Forecast of Detrended Air Temperature (LT)",
       x = "Date", y = "Detrended Air Temperature (LT)")

glob_forecast_plot <- autoplot(glob_forecast_arimax_detrended) +
  labs(title = "ARIMAX Forecast of Global Irradiation (GLOB) using Detrended Air Temperature (LT) as Exogenous Variable",
       x = "Date", y = "Global Irradiation (GLOB)")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(lt_forecast_plot, glob_forecast_plot, ncol = 2)

Using exogenous variables like air temperature (LT) in an ARIMAX model can enhance forecasting by capturing the influence of external factors on the target variable, such as global irradiation (GLOB). This helps model the relationship between variables, leading to more accurate predictions. However, it introduces complexity, as too many regressors can lead to overfitting, and the quality of external data is crucial for model performance. Additionally, ARIMAX assumes linear relationships, which may not always hold.

Detrending the data, such as through STL decomposition, removes long-term trends and isolates seasonal and cyclical behavior, making the data more stationary and easier to forecast. This can improve the model by focusing on short-term fluctuations. However, detrending may also obscure important long-term trends, especially if these trends are critical for the forecast. Moreover, incorrect detrending or over-differencing can distort the data, leading to poor model performance.